{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "439dadbc",
   "metadata": {},
   "source": [
    "### 1 movieLens部分数据展示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c062d709",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c699a8fb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>Adventure|Animation|Children|Comedy|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Jumanji (1995)</td>\n",
       "      <td>Adventure|Children|Fantasy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Grumpier Old Men (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Waiting to Exhale (1995)</td>\n",
       "      <td>Comedy|Drama|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Father of the Bride Part II (1995)</td>\n",
       "      <td>Comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>Heat (1995)</td>\n",
       "      <td>Action|Crime|Thriller</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>Sabrina (1995)</td>\n",
       "      <td>Comedy|Romance</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>Tom and Huck (1995)</td>\n",
       "      <td>Adventure|Children</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>Sudden Death (1995)</td>\n",
       "      <td>Action</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>GoldenEye (1995)</td>\n",
       "      <td>Action|Adventure|Thriller</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId                               title  \\\n",
       "0        1                    Toy Story (1995)   \n",
       "1        2                      Jumanji (1995)   \n",
       "2        3             Grumpier Old Men (1995)   \n",
       "3        4            Waiting to Exhale (1995)   \n",
       "4        5  Father of the Bride Part II (1995)   \n",
       "5        6                         Heat (1995)   \n",
       "6        7                      Sabrina (1995)   \n",
       "7        8                 Tom and Huck (1995)   \n",
       "8        9                 Sudden Death (1995)   \n",
       "9       10                    GoldenEye (1995)   \n",
       "\n",
       "                                        genres  \n",
       "0  Adventure|Animation|Children|Comedy|Fantasy  \n",
       "1                   Adventure|Children|Fantasy  \n",
       "2                               Comedy|Romance  \n",
       "3                         Comedy|Drama|Romance  \n",
       "4                                       Comedy  \n",
       "5                        Action|Crime|Thriller  \n",
       "6                               Comedy|Romance  \n",
       "7                           Adventure|Children  \n",
       "8                                       Action  \n",
       "9                    Action|Adventure|Thriller  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_movies = pd.read_csv('movies.csv')\n",
    "data_movies[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "00b67771",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   userId  movieId  rating  timestamp\n",
      "0       1        1     4.0  964982703\n",
      "1       1        3     4.0  964981247\n",
      "2       1        6     4.0  964982224\n",
      "3       1       47     5.0  964983815\n",
      "4       1       50     5.0  964982931\n",
      "5       1       70     3.0  964982400\n",
      "6       1      101     5.0  964980868\n",
      "7       1      110     4.0  964982176\n",
      "8       1      151     5.0  964984041\n",
      "9       1      157     5.0  964984100\n"
     ]
    }
   ],
   "source": [
    "data_rating = pd.read_csv('ratings.csv')\n",
    "print(data_rating[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "937994f1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>tag</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>60756</td>\n",
       "      <td>funny</td>\n",
       "      <td>1445714994</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>60756</td>\n",
       "      <td>Highly quotable</td>\n",
       "      <td>1445714996</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>60756</td>\n",
       "      <td>will ferrell</td>\n",
       "      <td>1445714992</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>89774</td>\n",
       "      <td>Boxing story</td>\n",
       "      <td>1445715207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>89774</td>\n",
       "      <td>MMA</td>\n",
       "      <td>1445715200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2</td>\n",
       "      <td>89774</td>\n",
       "      <td>Tom Hardy</td>\n",
       "      <td>1445715205</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2</td>\n",
       "      <td>106782</td>\n",
       "      <td>drugs</td>\n",
       "      <td>1445715054</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2</td>\n",
       "      <td>106782</td>\n",
       "      <td>Leonardo DiCaprio</td>\n",
       "      <td>1445715051</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2</td>\n",
       "      <td>106782</td>\n",
       "      <td>Martin Scorsese</td>\n",
       "      <td>1445715056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>7</td>\n",
       "      <td>48516</td>\n",
       "      <td>way too long</td>\n",
       "      <td>1169687325</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId                tag   timestamp\n",
       "0       2    60756              funny  1445714994\n",
       "1       2    60756    Highly quotable  1445714996\n",
       "2       2    60756       will ferrell  1445714992\n",
       "3       2    89774       Boxing story  1445715207\n",
       "4       2    89774                MMA  1445715200\n",
       "5       2    89774          Tom Hardy  1445715205\n",
       "6       2   106782              drugs  1445715054\n",
       "7       2   106782  Leonardo DiCaprio  1445715051\n",
       "8       2   106782    Martin Scorsese  1445715056\n",
       "9       7    48516       way too long  1169687325"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_tag = pd.read_csv('tags.csv')\n",
    "data_tag[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cba41f5c",
   "metadata": {},
   "source": [
    "### 2 协同过滤(CF)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5366a714",
   "metadata": {},
   "source": [
    "#### 2.1 余弦相似度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "b84588e5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.81649658],\n",
       "       [0.81649658, 1.        ]])"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "i = [1, 0, 0, 0]\n",
    "j = [1, 0.5, 0.5, 0]\n",
    "cosine_similarity([i, j])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e98a887d",
   "metadata": {},
   "source": [
    "#### 2.2 皮尔逊相关系数\n",
    "相比余弦相似度，皮尔逊相关系数通过使用用户平均分对个独立评分进行修正， 减少了用户评分偏置的影响"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "ee7c5b18",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0.8164965809277258, 0.1835034190722742)\n"
     ]
    }
   ],
   "source": [
    "from scipy.stats import pearsonr\n",
    "i = [1, 0, 0, 0]\n",
    "j = [1, 0.5, 0.5, 0]\n",
    "res = pearsonr(i, j)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c00a0de9",
   "metadata": {},
   "source": [
    "上述2个方法用来计算两个向量之间的相似程度"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18cb94e4",
   "metadata": {},
   "source": [
    "#### 2.3 最终结果预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eac78ad6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "28e1f8f0",
   "metadata": {},
   "source": [
    "### 参考资料\n",
    "* https://blog.csdn.net/wuzhongqiang/article/details/107891787\n",
    "* https://www.bilibili.com/video/BV12q4y1f7Mv?p=4&share_source=copy_web&vd_source=6164cc1e15b15d47186e6ecfe12edef8"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
